import pandas as pd
import jieba
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB  # 特征：词的次数
import joblib

# Load the labelled review data set (columns used below: "内容 " and "评价").
# NOTE(review): 'ANSI' is a Windows-only encoding alias (the active code page,
# typically GBK/cp936 on Chinese systems) — confirm before running on other OSes.
data = pd.read_csv("../ML_data/数据评论.csv",
                   encoding='ANSI')

train = data.head(10).copy()  # use the first 10 rows as the training set


# 1. Tokenize the text content.
def split_text(val):
    """Segment Chinese text *val* with jieba and return a space-joined token string.

    CountVectorizer expects whitespace-separated tokens, hence the join.
    """
    # str.join consumes the jieba generator directly — no need to build a list first.
    return " ".join(jieba.cut(val))


# Tokenize every review cell into a space-separated token string.
# (Note: the column name really does carry a trailing space in the CSV.)
train["内容 "] = train["内容 "].map(split_text)

# 2. Term-frequency vectorization.

# Load stop words, one per line, stripped of surrounding whitespace/newlines.
with open("../ML_data/stopWord.txt", encoding='utf-8') as f:
    stop_words = [line.strip() for line in f]

# Fit a bag-of-words vectorizer on the tokenized training text.
cnt = CountVectorizer(stop_words=stop_words)
cnt.fit(train["内容 "])  # learn the vocabulary (feature names)

X_train = cnt.transform(train["内容 "]).toarray()  # training-set feature matrix
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is the supported replacement.
print("特征名称", cnt.get_feature_names_out())

# Persist the fitted vectorizer so inference reuses the exact same vocabulary.
joblib.dump(cnt, "词频向量化.model")

# Labels for the training rows.
y_train = train["评价"]

# Inspect the training features and labels (values plus shapes).
for label, arr in (("训练集特征\n", X_train), ("训练集标签\n", y_train)):
    print(label, arr, arr.shape)


# Train a multinomial Naive Bayes classifier on the term-count features
# (fit returns the estimator itself, so construction and fitting chain).
nb = MultinomialNB().fit(X_train, y_train)

# Persist the trained classifier for later inference.
joblib.dump(nb, "数据分类_贝叶斯.model")